import pandas as pd

# numpy: Numerical Python
import numpy as np
# Load the penguins dataset from the local CSV file.
penguins = pd.read_csv('data/penguins.csv')

# Summarise the dataframe (numeric columns only).
penguins.describe()

# Summarise the dataframe across all columns.
penguins.describe(include="all")

# Average (mean) bill length.
penguins["bill_length_mm"].mean()     # notebook output: 43.9219298245614

# Standard deviation of bill length.
penguins["bill_length_mm"].std()      # notebook output: 5.4595837139265315

# Median bill length.
penguins["bill_length_mm"].median()   # notebook output: 44.45
# Group by + aggregate (sum / mean / median / ...).
# Filtering one species by hand is the manual alternative; it has to be
# repeated for every species, whereas groupby handles all groups in one call.
# penguins[ penguins['species'] == 'Adelie' ]['bill_length_mm'].mean()

#####
# penguins.groupby('species')['bill_length_mm'].mean()
penguins.groupby('species')['bill_length_mm'].median()

# Group by with several aggregate functions at once.
# The names must be separate list items: adjacent string literals without
# commas are concatenated by Python into a single (invalid) function name.
penguins.groupby('species')['bill_length_mm'].agg(['min', 'mean', 'median', 'std', 'max'])

# Group by more than one column.
penguins.groupby(['island', 'species'])['bill_length_mm'].agg(['min', 'mean', 'max'])

# Group by more than one column, turn the group keys back into ordinary
# columns with reset_index(), then export the table to a .csv file.
result = penguins.groupby(['island', 'species'])['bill_length_mm'].agg(['min', 'mean', 'max']).reset_index()

result.to_csv('result.csv')

result

# If the chain gets long, a trailing backslash continues it on the next line.
penguins.groupby(['island', 'species'])['bill_length_mm'] \
    .agg(['min', 'mean', 'max']) \
    .reset_index()
# Map values MALE -> m, FEMALE -> f.
# Values not found in the mapping (including missing ones) come back as NaN
# from .map(), and are then replaced with 'other' by .fillna().
# penguins['sex'].head()

# The mapping must be a dict of key: value pairs; without the colons and
# commas the braces build a one-element set instead.
penguins['sex_new'] = penguins['sex'].map({'MALE': 'm', 'FEMALE': 'f'}).fillna('other')

penguins.head()
# The same statistic, two ways.
# pandas style: call the method on the Series.
penguins["bill_length_mm"].mean()        # notebook output: 43.9219298245614

# numpy style: pass the Series to the numpy function.
np.mean(penguins["bill_length_mm"])      # notebook output: 43.9219298245614

# Other numpy functions.
# NOTE: np.std defaults to ddof=0 (population std) while pandas' .std()
# defaults to ddof=1 (sample std), so the two can give different numbers.
print(np.sum(penguins["bill_depth_mm"]))   # notebook output: 5865.700000000001
print(np.std(penguins["body_mass_g"]))     # notebook output: 800.7812292384522
# Conditional values with np.where(condition, value_if_true, value_if_false).
score = pd.Series([80, 55, 62, 95, 20])
print(score)
# notebook output:
# 0    80
# 1    55
# 2    62
# 3    95
# 4    20
# dtype: int64

# Element-wise: scores of 80 or more are 'passed', everything else 'failed'.
grade = np.where(score >= 80, 'passed', 'failed')

print(grade)
# notebook output:
# ['passed' 'failed' 'failed' 'passed' 'failed']
# Keep only Adelie penguins, select three columns, and drop rows that have
# a missing value in any of them.
df = penguins.query("species == 'Adelie'")[['species', 'island', 'bill_length_mm']].dropna()
df.head()

# Flag rows whose bill length is greater than 40 (boolean column).
df['new_column'] = np.where(df['bill_length_mm'] > 40, True, False)
df.head(10)
# Merge two dataframes on a shared key column.
# Every list must have the same number of items (one per row), otherwise
# pd.DataFrame raises a ValueError.
left = {
    'key': [1, 2, 3, 4],
    'name': ['toy', 'joe', 'jane', 'anna'],
    'age': [25, 28, 30, 22],
}

# NOTE(review): the original zip values had lost their separators; they are
# reconstructed here as four 4-digit values -- confirm against the notebook.
right = {
    'key': [1, 2, 3, 4],
    'city': ['Bangkok', 'London', 'Seoul', 'Tokyo'],
    'zip': [1001, 2504, 2094, 9802],
}

df_left = pd.DataFrame(left)
df_right = pd.DataFrame(right)
df_left
df_right

# pd.merge defaults to an inner join: only keys present in both frames are kept.
df_result = pd.merge(df_left, df_right, on='key')

df_result
# Histogram of one column.
# The trailing semicolon hides the <Axes: ylabel='Frequency'> text that a
# notebook would otherwise print under the cell.
penguins['body_mass_g'].plot(kind='hist');

# Histogram of two columns (and of a single column selected as a dataframe).
penguins[['body_mass_g', 'bill_length_mm']].plot(kind='hist', bins=30);
penguins[['bill_length_mm']].plot(kind='hist', bins=30, color='orange');

# Bar plot of the number of penguins per species.
penguins['species'].value_counts().plot(kind='bar', color=['salmon', 'orange', 'gold']);

# Scatter plot of bill length against bill depth.
penguins[['bill_length_mm', 'bill_depth_mm']] \
    .plot(x='bill_length_mm', y='bill_depth_mm', kind='scatter', color='orange');

# Datalore visualization: display the dataframe, then select the "Visualize" tab.
penguins